library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
library(plotly) # become less popular with increase in Shiny, but it creates great interactive plots
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer)

# Packages for Cluster Analysis
library(NbClust)
library(cluster)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.9.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(ggdendro)
## 
## Attaching package: 'ggdendro'
## The following object is masked from 'package:dendextend':
## 
##     theme_dendro
# Packages for text mining/sentiment analysis/word cloud
library(pdftools)
## Warning: package 'pdftools' was built under R version 3.5.2
library(tidytext)
library(wordcloud)

PART 1. k-means clustering

iris_nice <- iris %>% 
  clean_names()

ggplot(iris_nice) +
  geom_point(aes(x = petal_length, y = petal_width, color = species))

How many clusters do YOU think should exist, R?

number_est <- NbClust(iris_nice[1:4], min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 10 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# We'll stick with 3 clusters when we perform k-means

Perform k-means clustering with 3 groups:

iris_km <- kmeans(iris_nice[1:4], 3)

iris_km$size
## [1] 62 38 50
iris_km$centers
##   sepal_length sepal_width petal_length petal_width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     6.850000    3.073684     5.742105    2.071053
## 3     5.006000    3.428000     1.462000    0.246000
iris_km$cluster
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
# iris_km will view everything

iris_cl <- data.frame(iris_nice, cluster_no = factor(iris_km$cluster))

# when you run this it will add a cluster number column to the iris dataset
# let's look at a basic ggplot

ggplot(iris_cl) +
  geom_point(aes(x = sepal_length, y = sepal_width, color = cluster_no))

# better graph
ggplot(iris_cl) + 
  geom_point(aes(x = petal_length,
                 y = petal_width, 
                 color = cluster_no,
                 pch = species)) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw()

# use plotly to create a 3D rep of our clusters

plot_ly(x = iris_cl$petal_length, y = iris_cl$petal_width, z = iris_cl$sepal_width, type = "scatter3d", color = iris_cl$cluster_no, symbol = iris_cl$species, colors = "Dark2") 
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode